In [1]:
from sklearn.model_selection import train_test_split
from lightgbm.sklearn import LGBMRegressor
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler
import pickle
import os
import plotly.express as px
import pandas as pd
import numpy as np
from scipy.special import softmax

import ibis
ibis.set_backend("duckdb")
ibis.options.interactive = True
from ibis import _
import ibis.selectors as s
import warnings
warnings.filterwarnings('ignore')

Goal¶

  • In this example, we're going to try using the disease and gene embeddings to make predictions.
  • We'll try using the Target - disease associations evidence for this.
    • The associations be broken down into different sources.
    • The goal of this demo is just to explore what happens when we make predictions using it. It's unlikely it will work without some pruning.

Prelude (defining functions for later)¶

In [2]:
def construct_databases(base_loc):
    mappings = {}
    for directory in os.listdir(base_loc):
        if directory.startswith("."):
            continue
        loc = base_loc + "/" + directory
        t = ibis.read_parquet(loc)
        mappings["t_" + directory] = t
    return mappings

# Load parquet databases into local variables
locals().update(construct_databases("../../../data/open_targets/"))

def construct_scatterplot(df, mapper, hover_name, color=None, hover_data=None, size=2, filter=None):
    embeddings = mapper.embedding_.T
    df["x"], df["y"], df["z"] = embeddings
        
    fig = px.scatter_3d(df[filter(df)] if filter is not None else df, 
                        x="x", y="y", z="z", color=color,
                        hover_name=hover_name, hover_data=hover_data)
    fig.update_layout(margin=dict(l=0, r=0, t=0, b=0))
    fig.update_traces(marker=dict(size=size))
    return fig.show()

Loading Stored Models / Data¶

In [3]:
gene_df = pickle.load(open("models/gene_df.sav", 'rb'))
disease_df = pickle.load(open("models/disease_df.sav", 'rb'))

gene_desc_mapper = pickle.load(open("models/gene_desc_mapper.sav", 'rb'))
gene_go_mapper = pickle.load(open("models/gene_go_mapper.sav", 'rb'))
disease_expression_mapper = pickle.load(open("models/disease_expression_mapper.sav", 'rb'))
disease_desc_mapper = pickle.load(open("models/disease_desc_mapper.sav", 'rb'))
gene_nucleotide_mapper = pickle.load(open("models/gene_nucleotide_mapper.sav", 'rb'))
gene_protein_mapper = pickle.load(open("models/gene_protein_mapper.sav", 'rb'))

Loading dataframes into Ibis memory tables¶

In [4]:
t_gene_df = (ibis.memtable(gene_df)
             .relabel({"id":"targetId", "index":"geneIndex"})) # quck fixes to harmonize
t_disease_df = (ibis.memtable(disease_df)
             .relabel({"index":"diseaseIndex"})) # quick fixe to harmonize
In [5]:
t_gene_df
Out[5]:
┏━━━━━━━━━━━━━━━━━┳━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━┳━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━┳━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━┳━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━┳━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━┳━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━┳━━━━━━━━┳━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━┳━━━━━━━━━━━┳━━━━━━━━━━━┳━━━━━━━━━━┳━━━━━━━━━━┓
┃ targetId        ┃ approvedName                                            ┃ go_desc                                                                          ┃ functionDescriptions                                                             ┃ location                     ┃ truncDesc                                          ┃ nucleotide                                                                       ┃ contig ┃ protein                                                                          ┃ geneIndex ┃ x         ┃ y        ┃ z        ┃
┡━━━━━━━━━━━━━━━━━╇━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━╇━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━╇━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━╇━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━╇━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━╇━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━╇━━━━━━━━╇━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━╇━━━━━━━━━━━╇━━━━━━━━━━━╇━━━━━━━━━━╇━━━━━━━━━━┩
│ string          │ string                                                  │ string                                                                           │ string                                                                           │ string                       │ string                                             │ string                                                                           │ string │ string                                                                           │ int64     │ float32   │ float32  │ float32  │
├─────────────────┼─────────────────────────────────────────────────────────┼──────────────────────────────────────────────────────────────────────────────────┼──────────────────────────────────────────────────────────────────────────────────┼──────────────────────────────┼────────────────────────────────────────────────────┼──────────────────────────────────────────────────────────────────────────────────┼────────┼──────────────────────────────────────────────────────────────────────────────────┼───────────┼───────────┼──────────┼──────────┤
│ ENSG00000059588 │ TAR (HIV-1) RNA binding protein 1                       │ tRNA (guanine) methyltransferase activity regulation of transcription by RNA po… │ Probable S-adenosyl-L-methionine-dependent methyltransferase which methylates R… │ Nuclear speckles             │ Probable S-adenosyl-L-methionine-dependent methylt │ ATGGAGTGGGTGCTCGCGGAAGCGCTGCTCTCGCAGAGCCGGGACCCCCGGGCCCTGCTTGGGGCGCTGTGCCAAGGGG… │ 1      │ MEWVLAEALLSQSRDPRALLGALCQGEASAERVETLRFLLQRLEDEEARGSGGAGALPEAAREVAAGYLVPLLRSLRGR… │         0 │ 10.431715 │ 4.211885 │ 9.016659 │
│ ENSG00000072071 │ adhesion G protein-coupled receptor L1                  │ latrotoxin receptor activity G protein-coupled receptor signaling pathway plasm… │ Calcium-independent receptor of high affinity for alpha- latrotoxin, an excitat… │ Cell membrane                │ Calcium-independent receptor of high affinity for  │ TCTTTTTTTTTTTTTTTCCTAATTTTTGGTCGGCGGCGGTGCTGGGCCAGGGGAAGGAAGGGACACGGAGGCCGCCCTC… │ 19     │ SFFFFFLIFGRRRCWARGRKGHGGRPRPATSYPLPPSPGSGRCAGRGARVRRAAGETRWADPREALDRLVVQAVVPAR*… │         1 │  9.038093 │ 6.293142 │ 4.004947 │
│ ENSG00000073536 │ notchless homolog 1                                     │ skeletal system morphogenesis positive regulation of canonical Wnt signaling pa… │ Plays a role in regulating Notch activity. Plays a role in regulating the expre… │ Nucleus                      │ Plays a role in regulating Notch activity. Plays a │ GGACGCAGGATGGCGGCAGCAGTGCCGGTGGGTGTGCGTGGATGGGGGCGGGGGGCGTCGCCGCGGGGCGCTAGGGCCC… │ 17     │ GRRMAAAVPVGVRGWGRGASPRGARARVPEA*ERRSGALLGGRPD*LPPHLLAQDEAVARDVQRLLVQFQDEGGQLLGS… │         2 │  8.673260 │ 5.322880 │ 6.614601 │
│ ENSG00000075290 │ Wnt family member 8B                                    │ canonical Wnt signaling pathway signal transduction nervous system development … │ Ligand for members of the frizzled family of seven transmembrane receptors. May… │ Secreted                     │ Ligand for members of the frizzled family of seven │ CGCTTACACACCAAGGAAGTTGGGCTTTGAGAATTCCATCCCACTGGCACTGAGGAGAATATTTCTCCGTCTTGCTTAC… │ 10     │ RLHTKEVGL*EFHPTGTEENISPSCLPISQFFGIFSSCYSRGLCFFQSLLCTSVFSPVSSNSATAGR*TIS**LVQRLT… │         3 │  7.238286 │ 5.980227 │ 6.065460 │
│ ENSG00000083454 │ purinergic receptor P2X 5                               │ positive regulation of calcium ion transport into cytosol positive regulation o… │ Receptor for ATP that acts as a ligand-gated ion channel.                        │ Membrane                     │ Receptor for ATP that acts as a ligand-gated ion c │ CGGGCGCCGGGCGCGCAGGGACCGAGGGACCGAGTGCTCCCCATGAGCGCACGTGGGCCGGGCGGTCCGCAAGCCCGGC… │ 17     │ RAPGAQGPRDRVLPMSARGPGGPQARLRARHGAGGLQGALPVAVRLQDREVCHRQEQEGGPAVPAAAGLHPGVPGRMGV… │         4 │  9.326583 │ 6.167222 │ 3.889343 │
│ ENSG00000083782 │ epiphycan                                               │ articular cartilage development glycosaminoglycan binding bone development extr… │ May have a role in bone formation and also in establishing the ordered structur… │ Secreted                     │ May have a role in bone formation and also in esta │ ACAGCCATTGGTCAGGGGCAAATACCACTAGCTCTGCATCCTCAGTCACTTTGTGCCATTTCATCAGGTCAGAGCCAAA… │ 12     │ TAIGQGQIPLALHPQSLCAISSGQSQRKA*KMKTLAGLVLGLVIFDAAVTAPTLESINYDSETYDATLEDLDNLYNYEN… │         5 │  9.566043 │ 3.741266 │ 9.382458 │
│ ENSG00000087087 │ serrate, RNA effector molecule                          │ primary miRNA processing regulation of DNA-templated transcription DNA binding … │ Acts as a mediator between the cap-binding complex (CBC) and the primary microR… │ Nucleus                      │ Acts as a mediator between the cap-binding complex │ GTGCCTCGGAGGCGTGGGTGACGCAGGCGCAGCGCGGGCTGCGCGCGCTACTGCCCATCCCCGGTTGTCCCACTTTTGT… │ 7      │ VPRRRG*RRRSAGCARYCPSPVVPLLFASLRPSTQELRLRLALLEVLVARPRPRSP*NLARPSASPTAAAAPRPPQTVP… │         6 │  6.632429 │ 6.814382 │ 5.678242 │
│ ENSG00000087502 │ ERGIC and golgi 2                                       │ retrograde vesicle-mediated transport, Golgi to endoplasmic reticulum transport… │ Possible role in transport between endoplasmic reticulum and Golgi. .            │ Endoplasmic reticulum        │ Possible role in transport between endoplasmic ret │ TCTGTGAAACATGGCGGTAGGCTGGGACCATAACACAAGCATGACTATATGAAGGAAGAGGAAGGTTTTCCTGAAGATG… │ 12     │ SVKHGGRLGP*HKHDYMKEEEGFPEDEATESEKNFKFGKRVGCLSEGS*ELCRDFSQWRYSFSNSIYNYGFINHNGILS… │         7 │  7.251029 │ 1.868455 │ 9.219758 │
│ ENSG00000092201 │ SPT16 homolog, facilitates chromatin remodeling subunit │ nucleoplasm nucleoplasm FACT complex RNA binding nucleoplasm nucleoplasm transc… │ Component of the FACT complex, a general chromatin factor that acts to reorgani… │ Nucleus                      │ Component of the FACT complex, a general chromatin │ GGCAGACCGTCACGTGACGACGTCGATTCGCGTGCGGCAGTGGCGAAGTTGACAAACCCCGCGAAAATCGACTCTTTGC… │ 14     │ GRPSRDDVDSRAAVAKLTNPAKIDSLHRTFC*FSLVFLSLFPPSIRKRVGKKQNKQTNKKKT*RCWDPEAERASLRSIL… │         8 │ 10.341167 │ 4.429931 │ 9.074904 │
│ ENSG00000102078 │ solute carrier family 25 member 14                      │ plasma membrane mitochondrial inner membrane mitochondrial inner membrane mitoc… │ Participates in the mitochondrial proton leak measured in brain mitochondria.    │ Mitochondrion inner membrane │ Participates in the mitochondrial proton leak meas │ GTTGGTTTCAATGCTTCCGGGTTGGCGCTGCAGTGGCGTTTCCGACTGTGGGAGCCTCAGCTTCCCAGTCGTCCGATGA… │ Other  │ VGFNASGLALQWRFRLWEPQLPSRPMSPSS*VPSLSFTLLASVVLLLRLNPASSTPLGGRLLQAPPFSPRARSDS*GTG… │         9 │  9.445682 │ 3.893537 │ 8.485611 │
│ …               │ …                                                       │ …                                                                                │ …                                                                                │ …                            │ …                                                  │ …                                                                                │ …      │ …                                                                                │         … │         … │        … │        … │
└─────────────────┴─────────────────────────────────────────────────────────┴──────────────────────────────────────────────────────────────────────────────────┴──────────────────────────────────────────────────────────────────────────────────┴──────────────────────────────┴────────────────────────────────────────────────────┴──────────────────────────────────────────────────────────────────────────────────┴────────┴──────────────────────────────────────────────────────────────────────────────────┴───────────┴───────────┴──────────┴──────────┘
In [6]:
t_disease_df
Out[6]:
┏━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━┳━━━━━━━━━━━━━┳━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━┳━━━━━━━━━━━━━━━━━━━━━━━━━━┳━━━━━━━━━━━━━━━━━━━━━━━━┳━━━━━━━━━━━━━━━━━━━━━┳━━━━━━━━━━━━━━━━━━━━┳━━━━━━━━━━━━━━┳━━━━━━━━━━━━━━━━━━━┳━━━━━━━━━━━━━━━━━━━━━━┳━━━━━━━━━━━━━━━━━━━━━┳━━━━━━━━━━━━━━━━━━┳━━━━━━━━━━━━━━━━━━━━┳━━━━━━━━━━━━━━━━━━━━━━━━━━━┳━━━━━━━━━━━━━━━━┳━━━━━━━━━━━━━━━┳━━━━━━━━━━━━━━━━━━━━━┳━━━━━━━━━━━━━━━━━━┳━━━━━━━━━━━━━━━━━┳━━━━━━━━━━━━━━━━━━━━━┳━━━━━━━━━━━━━━┳━━━━━━━━━━┳━━━━━━━━━━┳━━━━━━━━━━┓
┃ name                                                 ┃ diseaseId   ┃ desc                                                                             ┃ most_expressed_in_system ┃ musculoskeletal_system ┃ integumental_system ┃ circulatory_system ┃ renal_system ┃ connective_tissue ┃ hematopoietic_system ┃ hemolymphoid_system ┃ digestive_system ┃ respiratory_system ┃ external_soft_tissue_zone ┃ nervous_system ┃ immune_system ┃ anatomical_junction ┃ endocrine_system ┃ anatomical_wall ┃ reproductive_system ┃ diseaseIndex ┃ x        ┃ y        ┃ z        ┃
┡━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━╇━━━━━━━━━━━━━╇━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━╇━━━━━━━━━━━━━━━━━━━━━━━━━━╇━━━━━━━━━━━━━━━━━━━━━━━━╇━━━━━━━━━━━━━━━━━━━━━╇━━━━━━━━━━━━━━━━━━━━╇━━━━━━━━━━━━━━╇━━━━━━━━━━━━━━━━━━━╇━━━━━━━━━━━━━━━━━━━━━━╇━━━━━━━━━━━━━━━━━━━━━╇━━━━━━━━━━━━━━━━━━╇━━━━━━━━━━━━━━━━━━━━╇━━━━━━━━━━━━━━━━━━━━━━━━━━━╇━━━━━━━━━━━━━━━━╇━━━━━━━━━━━━━━━╇━━━━━━━━━━━━━━━━━━━━━╇━━━━━━━━━━━━━━━━━━╇━━━━━━━━━━━━━━━━━╇━━━━━━━━━━━━━━━━━━━━━╇━━━━━━━━━━━━━━╇━━━━━━━━━━╇━━━━━━━━━━╇━━━━━━━━━━┩
│ string                                               │ string      │ string                                                                           │ string                   │ float64                │ float64             │ float64            │ float64      │ float64           │ float64              │ float64             │ float64          │ float64            │ float64                   │ float64        │ float64       │ float64             │ float64          │ float64         │ float64             │ int64        │ float32  │ float32  │ float32  │
├──────────────────────────────────────────────────────┼─────────────┼──────────────────────────────────────────────────────────────────────────────────┼──────────────────────────┼────────────────────────┼─────────────────────┼────────────────────┼──────────────┼───────────────────┼──────────────────────┼─────────────────────┼──────────────────┼────────────────────┼───────────────────────────┼────────────────┼───────────────┼─────────────────────┼──────────────────┼─────────────────┼─────────────────────┼──────────────┼──────────┼──────────┼──────────┤
│ gonorrhea                                            │ DOID_7551   │ A primary bacterial infectious disease that is a sexually transmitted infection… │ connective_tissue        │              -0.096844 │           -0.124075 │          -0.269214 │    -0.183644 │          0.082471 │            -0.241006 │           -0.171879 │        -0.138369 │          -0.388618 │                 -0.267145 │      -0.324140 │     -0.419989 │           -0.315517 │        -0.397776 │       -0.165230 │           -0.310319 │            0 │ 9.960172 │ 7.799116 │ 3.600109 │
│ respiratory quotient                                 │ EFO_0005189 │ The respiratory quotient (or RQ or respiratory coefficient), is a dimensionless… │ musculoskeletal_system   │               2.000000 │           -0.400000 │          -0.400000 │    -0.666667 │          0.000000 │             0.300000 │            0.000000 │        -0.352941 │          -0.500000 │                  0.000000 │      -0.800000 │      0.437500 │           -1.000000 │        -0.333333 │        0.000000 │            0.083333 │            1 │ 9.965736 │ 7.900253 │ 5.888873 │
│ response to silica exposure                          │ EFO_0005853 │ short or long term physiological response of an organism, eg in terms of deposi… │ connective_tissue        │              -0.500000 │            0.266667 │          -0.733333 │    -0.222222 │          0.666667 │             0.266667 │            0.571429 │         0.058824 │          -0.138889 │                 -0.333333 │      -0.750000 │      0.479167 │           -0.666667 │        -0.185185 │        0.333333 │           -0.416667 │            2 │ 9.978691 │ 6.141375 │ 0.409376 │
│ response to thiopurine                               │ EFO_0006317 │ Any process that results in a change in state or activity of a cell or an organ… │ hemolymphoid_system      │               0.250000 │            0.250000 │          -0.426389 │     0.083333 │         -0.125000 │            -0.112500 │            0.482143 │         0.209099 │           0.125000 │                 -0.375000 │      -0.402778 │      0.078125 │           -0.875000 │        -0.588542 │       -0.375000 │           -0.741667 │            3 │ 9.299581 │ 6.741986 │ 1.319081 │
│ cryptococcosis                                       │ EFO_0007229 │ An opportunistic mycosis that results_in fungal infection and has_material_basi… │ hemolymphoid_system      │              -0.274286 │           -0.309714 │          -0.405206 │    -0.312381 │         -0.468571 │            -0.230857 │           -0.031837 │        -0.155399 │          -0.220000 │                 -0.451429 │      -0.514404 │     -0.271786 │           -0.537143 │        -0.509762 │       -0.091429 │           -0.504450 │            4 │ 8.008451 │ 6.225818 │ 1.558063 │
│ Nematoda infectious disease                          │ EFO_0007391 │ Infections caused by nematode larvae which never develop into the adult stage a… │ anatomical_wall          │              -0.345930 │           -0.247674 │          -0.374354 │    -0.184109 │         -0.255814 │            -0.322674 │           -0.178571 │        -0.090544 │          -0.357558 │                 -0.325581 │      -0.578775 │     -0.430596 │           -0.430233 │        -0.452116 │        0.005814 │           -0.487350 │            5 │ 7.896668 │ 6.960781 │ 2.706551 │
│ interleukin 1 Receptor accessory protein measurement │ EFO_0008167 │ quantification of the amount of interleukin 1 Receptor accessory protein in a s… │ hematopoietic_system     │              -0.750000 │           -0.500000 │          -0.800000 │    -0.500000 │         -0.500000 │             0.300000 │           -0.357143 │        -0.676471 │          -0.750000 │                 -1.000000 │      -0.925000 │     -0.593750 │           -1.000000 │        -0.555556 │       -1.000000 │           -0.666667 │            6 │ 8.840138 │ 4.281627 │ 1.280424 │
│ interleukin 23 receptor measurement                  │ EFO_0008181 │ quantification of the amount of interleukin 23 receptor in a sample              │ hemolymphoid_system      │              -0.500000 │           -0.650000 │          -0.944444 │    -0.666667 │         -0.500000 │            -0.325000 │           -0.035714 │        -0.133272 │          -0.562500 │                 -1.000000 │      -0.805556 │     -0.312500 │           -1.000000 │        -0.315972 │       -0.750000 │           -0.354167 │            7 │ 8.678880 │ 4.615093 │ 1.183100 │
│ atypical femoral fracture                            │ EFO_0009960 │ Stress or insufficency fractures occurring in the femoral shaft, typically in r… │ hemolymphoid_system      │              -0.333333 │           -0.133333 │          -0.633333 │    -0.444444 │         -0.666667 │             0.133333 │            0.238095 │        -0.607843 │          -0.750000 │                 -0.333333 │      -0.816667 │      0.125000 │           -0.666667 │        -0.666667 │        0.000000 │           -0.750000 │            8 │ 8.702263 │ 5.653661 │ 0.854118 │
│ CD40 measurement                                     │ EFO_0010586 │ quantification of the amount of CD40 in a sample                                 │ digestive_system         │              -0.750000 │            0.500000 │          -0.400000 │     0.333333 │         -0.500000 │            -0.600000 │            0.214286 │         0.705882 │           0.458333 │                  0.500000 │      -0.875000 │     -0.468750 │           -0.500000 │        -0.111111 │        0.000000 │           -0.083333 │            9 │ 7.339328 │ 6.733737 │ 3.136575 │
│ …                                                    │ …           │ …                                                                                │ …                        │                      … │                   … │                  … │            … │                 … │                    … │                   … │                … │                  … │                         … │              … │             … │                   … │                … │               … │                   … │            … │        … │        … │        … │
└──────────────────────────────────────────────────────┴─────────────┴──────────────────────────────────────────────────────────────────────────────────┴──────────────────────────┴────────────────────────┴─────────────────────┴────────────────────┴──────────────┴───────────────────┴──────────────────────┴─────────────────────┴──────────────────┴────────────────────┴───────────────────────────┴────────────────┴───────────────┴─────────────────────┴──────────────────┴─────────────────┴─────────────────────┴──────────────┴──────────┴──────────┴──────────┘

Use overall direct association scores to create test/train set¶

In [7]:
t_samples = (t_associationByOverallDirect
 .inner_join(t_disease_df.select("diseaseId", "diseaseIndex"), "diseaseId")
 .inner_join(t_gene_df.select("targetId", "geneIndex"), "targetId")
 .filter(_.evidenceCount > 1)
 .mutate(  
     is_train = ibis.random() >= 0.2, # 80% will be for training, the other 20% for testing
     random_order = ibis.random()) # Used to shuffle the data
 .order_by(_.random_order))
t_samples
Out[7]:
┏━━━━━━━━━━━━━━━┳━━━━━━━━━━━━━━━━━┳━━━━━━━━━━┳━━━━━━━━━━━━━━━┳━━━━━━━━━━━━━━┳━━━━━━━━━━━┳━━━━━━━━━━┳━━━━━━━━━━━━━━┓
┃ diseaseId     ┃ targetId        ┃ score    ┃ evidenceCount ┃ diseaseIndex ┃ geneIndex ┃ is_train ┃ random_order ┃
┡━━━━━━━━━━━━━━━╇━━━━━━━━━━━━━━━━━╇━━━━━━━━━━╇━━━━━━━━━━━━━━━╇━━━━━━━━━━━━━━╇━━━━━━━━━━━╇━━━━━━━━━━╇━━━━━━━━━━━━━━┩
│ string        │ string          │ float64  │ int64         │ int64        │ int64     │ boolean  │ float64      │
├───────────────┼─────────────────┼──────────┼───────────────┼──────────────┼───────────┼──────────┼──────────────┤
│ EFO_0004340   │ ENSG00000185513 │ 0.223393 │             5 │         6052 │      4212 │ True     │ 8.789357e-07 │
│ MONDO_0019457 │ ENSG00000116478 │ 0.046197 │             2 │         2153 │      9811 │ False    │ 1.127995e-05 │
│ Orphanet_1997 │ ENSG00000125046 │ 0.040210 │             2 │         5811 │      4434 │ True     │ 1.136772e-05 │
│ HP_0002373    │ ENSG00000123607 │ 0.086279 │             2 │        10767 │      9561 │ True     │ 1.171301e-05 │
│ EFO_0005547   │ ENSG00000026025 │ 0.044606 │             4 │          428 │     14422 │ True     │ 2.124696e-05 │
│ HP_0012758    │ ENSG00000132535 │ 0.426218 │             4 │         6210 │      7932 │ True     │ 2.341042e-05 │
│ EFO_0004514   │ ENSG00000010270 │ 0.363329 │             9 │         9578 │     13902 │ False    │ 2.359785e-05 │
│ EFO_0000708   │ ENSG00000171195 │ 0.268628 │             5 │         3693 │      5873 │ False    │ 2.487563e-05 │
│ MONDO_0100330 │ ENSG00000135318 │ 0.004065 │             2 │         7145 │      6197 │ True     │ 2.563954e-05 │
│ EFO_0001054   │ ENSG00000185338 │ 0.025761 │             3 │         3398 │     15839 │ True     │ 2.632733e-05 │
│ …             │ …               │        … │             … │            … │         … │ …        │            … │
└───────────────┴─────────────────┴──────────┴───────────────┴──────────────┴───────────┴──────────┴──────────────┘

Functions to extract, concatenate, and transform features¶

In [8]:
def extract_embedding(mapper, index):
    return mapper.embedding_[index]

def create_feature(gene_index, disease_index, gene_mappers, disease_mappers):
    feature_vec = []

    for disease_mapper in disease_mappers:
        feature_vec.extend(extract_embedding(disease_mapper, disease_index))
    for gene_mapper in gene_mappers:
        feature_vec.extend(extract_embedding(gene_mapper, gene_index))
    
    # Using softmax 
    # (approximating a distrubution over gene/disease features)
    feature_vec = softmax(feature_vec)
    return feature_vec
    

def extract_features(sample_entries, gene_mappers, disease_mappers):
    feature_mat = []
    target_vec = []
    for entry in sample_entries:
        feature_vec = create_feature(entry["geneIndex"], 
                                     entry["diseaseIndex"], 
                                     gene_mappers, disease_mappers)
        feature_mat.append(feature_vec)
        target_vec.append(entry["score"])

    feature_mat = np.asarray(feature_mat)
    feature_mat[np.isnan(feature_mat)] = 0
    return np.asarray(feature_mat), np.asarray(target_vec)

Preparing Train and Test Data¶

In [9]:
test_entries = []
train_entries = []
gene_mappers = [gene_desc_mapper, gene_go_mapper, gene_nucleotide_mapper, gene_protein_mapper]
disease_mappers = [disease_desc_mapper, disease_expression_mapper]

# Pyarrow record batch readers are useful for iterating over large datasets
# It's a bit overkill in this example
# Also useful for online learning algorithms (learning iteratively over small batches)
def to_chunk_iterator(table, chunk_size=100):
    record_reader = table.to_pyarrow_batches(chunk_size=chunk_size)
    while True:
        try:
            yield record_reader.read_next_batch().to_pylist()
        except StopIteration:
            break
            
for chunk in to_chunk_iterator(t_samples, chunk_size=1000):
    test_entries.extend(entry for entry in chunk if entry["is_train"] == False) 
    train_entries.extend([entry for entry in chunk if entry["is_train"] == True])

# In this demo, extracting features and keeping them all in memory
# Not always a good idea depending upon how large they are
train_X, train_Y = \
    extract_features(train_entries, gene_mappers, disease_mappers)
test_X, test_Y = \
    extract_features(test_entries, gene_mappers, disease_mappers)

Defining Model¶

  • Features concatenated and transformed via softmax (approximating joint distribution)
  • A bit hacky in this case -- we're dealing with continuous association scores that range between 0 and 1, rather
  • In this demo, we're really not trying to fine-tune the model or try anything fancy. Lots of room for improvement.
  • Trying out gradient boosted trees using LightGBM
In [10]:
model = LGBMRegressor(objective="cross_entropy", num_iterations=40, learning_rate=0.05)
pipe = Pipeline([('scalar', StandardScaler()),
                 ('model', model)])

Train/Test¶

In [11]:
pipe.fit(train_X, train_Y)
Out[11]:
Pipeline(steps=[('scalar', StandardScaler()),
                ('model',
                 LGBMRegressor(learning_rate=0.05, num_iterations=40,
                               objective='cross_entropy'))])
In a Jupyter environment, please rerun this cell to show the HTML representation or trust the notebook.
On GitHub, the HTML representation is unable to render, please try loading this page with nbviewer.org.
Pipeline(steps=[('scalar', StandardScaler()),
                ('model',
                 LGBMRegressor(learning_rate=0.05, num_iterations=40,
                               objective='cross_entropy'))])
StandardScaler()
LGBMRegressor(learning_rate=0.05, num_iterations=40, objective='cross_entropy')
In [12]:
# Quick measures for comparison (based on test predictions)
mse = ((pipe.predict(test_X) - test_Y)**2).mean()
rss = ((pipe.predict(test_X) - test_Y)**2).sum()
tss = ((test_Y - test_Y.mean())**2).sum()
r2 = rss/tss
print("MSE: {}\nR-squared: {}".format(mse, r2))
MSE: 0.016242092156562287
R-squared: 0.8318774619447576

Visualization¶

  • In this example, we're fixing the features of a given disease (Alzheimer's), and making predictions by ranging over target features
  • This allows us to predict a kind of "likelihood" that a target is associated with a given disease
  • Take these results with a grain of salt because we're just exploring the data/models a little in this demo
In [13]:
# In this example, predicting associated targets given Alzheimer's Disease
alz_index = t_disease_df.filter(_.diseaseId == "MONDO_0004975").diseaseIndex.first().execute()

X = []
for gene_index in range(t_gene_df.count().execute()):
    fvec = create_feature(gene_index=gene_index, disease_index=alz_index, disease_mappers=disease_mappers, gene_mappers=gene_mappers)
    X.append(fvec)
X = np.asarray(X)

scores = pipe.predict(X) # we could normalize this to a true distribution, but it wouldn't affect rankings
gene_df["predictedScore"] = scores

# We are going to visualize the results by coloring our gene description embedding plot with them
construct_scatterplot(gene_df, gene_desc_mapper, hover_name="approvedName", 
                      color="predictedScore", hover_data=["location", "id"], size=2.5)

Visualization -- Filtered (Top 1%)¶

  • Below is a filtered version of the previous plot (only considering the top 1% according to predicted probabilities
  • Make note of the "location" variable when you mouseover. These are (one of) the associated sub cellular locations of each of the targets.
In [14]:
alz_index = t_disease_df.filter(_.diseaseId == "MONDO_0004975").select("diseaseId", "diseaseIndex").mutate(score=0).diseaseIndex.first().execute()

X = []

for gene_index in range(t_gene_df.count().execute()):
    fvec = create_feature(gene_index=gene_index, disease_index=alz_index, disease_mappers=disease_mappers, gene_mappers=gene_mappers)
    X.append(fvec)
X = np.asarray(X)

scores = pipe.predict(X)
gene_df["predictedScore"] = scores

construct_scatterplot(gene_df, gene_desc_mapper, hover_name="approvedName", 
                      color="predictedScore", hover_data=["location", "id"], size=2.5,
                      filter = lambda x: x["predictedScore"] > x["predictedScore"].quantile(0.99))

Comparing Predictions vs. Associated Scores¶

Predicted Scores:¶

  • One thing that is clear by looking at the table: a lot of the targets are very, very similar, at least with respect to our choice of features.
  • While not shown in this demo, clustering into groups might help to deduplicate
  • In the previous plot, most of the very similar targets significantly overlap each other, which is why you don't notice the same level of duplication as we do in this dataframe
  • Note that having a lot of "near duplicates" might also affect our training/predictions (unbalanced data), so that's an important consideration
In [15]:
nt = ibis.memtable(gene_df)
In [16]:
(nt
 .order_by(ibis.desc(_.predictedScore))
 .head(60)
).execute()[["predictedScore", "id", "approvedName", "functionDescriptions"]]
Out[16]:
predictedScore id approvedName functionDescriptions
0 0.290234 ENSG00000022355 gamma-aminobutyric acid type A receptor subuni... Ligand-gated chloride channel which is a compo...
1 0.288247 ENSG00000113327 gamma-aminobutyric acid type A receptor subuni... Ligand-gated chloride channel which is a compo...
2 0.288247 ENSG00000145864 gamma-aminobutyric acid type A receptor subuni... Ligand-gated chloride channel which is a compo...
3 0.273529 ENSG00000111886 gamma-aminobutyric acid type A receptor subuni... GABA, the major inhibitory neurotransmitter in...
4 0.271424 ENSG00000268089 gamma-aminobutyric acid type A receptor subuni... GABA, the major inhibitory neurotransmitter in...
5 0.271424 ENSG00000011677 gamma-aminobutyric acid type A receptor subuni... GABA, the major inhibitory neurotransmitter in...
6 0.271424 ENSG00000146276 gamma-aminobutyric acid type A receptor subuni... GABA, the major inhibitory neurotransmitter in...
7 0.271424 ENSG00000145863 gamma-aminobutyric acid type A receptor subuni... GABA, the major inhibitory neurotransmitter in...
8 0.271424 ENSG00000183185 gamma-aminobutyric acid type A receptor subuni... GABA, the major inhibitory neurotransmitter in...
9 0.271424 ENSG00000163288 gamma-aminobutyric acid type A receptor subuni... Component of the heteropentameric receptor for...
10 0.271424 ENSG00000186297 gamma-aminobutyric acid type A receptor subuni... Ligand-gated chloride channel subunit which is...
11 0.271424 ENSG00000102287 gamma-aminobutyric acid type A receptor subuni... GABA, the major inhibitory neurotransmitter in...
12 0.271424 ENSG00000182256 gamma-aminobutyric acid type A receptor subuni... GABA, the major inhibitory neurotransmitter in...
13 0.271424 ENSG00000151834 gamma-aminobutyric acid type A receptor subuni... Ligand-gated chloride channel which is a compo...
14 0.271424 ENSG00000109158 gamma-aminobutyric acid type A receptor subuni... GABA, the major inhibitory neurotransmitter in...
15 0.271424 ENSG00000163285 gamma-aminobutyric acid type A receptor subuni... GABA, the major inhibitory neurotransmitter in...
16 0.271424 ENSG00000094755 gamma-aminobutyric acid type A receptor subuni... GABA, the major inhibitory neurotransmitter in...
17 0.270600 ENSG00000187730 gamma-aminobutyric acid type A receptor subuni... GABA, the major inhibitory neurotransmitter in...
18 0.260201 ENSG00000166206 gamma-aminobutyric acid type A receptor subuni... Ligand-gated chloride channel which is a compo...
19 0.231567 ENSG00000178084 5-hydroxytryptamine receptor 3C This is one of the several different receptors...
20 0.229744 ENSG00000186090 5-hydroxytryptamine receptor 3D This is one of the several different receptors...
21 0.229744 ENSG00000186038 5-hydroxytryptamine receptor 3E This is one of the several different receptors...
22 0.229433 ENSG00000166736 5-hydroxytryptamine receptor 3A This is one of the several different receptors...
23 0.227939 ENSG00000186919 zinc activated ion channel Zinc-activated ligand-gated ion channel. .
24 0.227847 ENSG00000181752 olfactory receptor family 8 subfamily K member 5 Odorant receptor. .
25 0.227847 ENSG00000280090 olfactory receptor family 8 subfamily B member 4 Odorant receptor. .
26 0.226560 ENSG00000089041 purinergic receptor P2X 7 Receptor for ATP that acts as a ligand-gated i...
27 0.224052 ENSG00000149305 5-hydroxytryptamine receptor 3B This is one of the several different receptors...
28 0.223978 ENSG00000108405 purinergic receptor P2X 1 Ligand-gated ion channel with relatively high ...
29 0.223754 ENSG00000176884 glutamate ionotropic receptor NMDA type subunit 1 Component of NMDA receptor complexes that func...
30 0.221723 ENSG00000083454 purinergic receptor P2X 5 Receptor for ATP that acts as a ligand-gated i...
31 0.221490 ENSG00000099957 purinergic receptor P2X 6 Receptor for ATP that acts as a ligand-gated i...
32 0.213317 ENSG00000273079 glutamate ionotropic receptor NMDA type subuni... Component of NMDA receptor complexes that func...
33 0.213285 ENSG00000183454 glutamate ionotropic receptor NMDA type subuni... Component of NMDA receptor complexes that func...
34 0.209648 ENSG00000166862 calcium voltage-gated channel auxiliary subuni... Regulates the trafficking and gating propertie...
35 0.208525 ENSG00000105464 glutamate ionotropic receptor NMDA type subuni... Component of NMDA receptor complexes that func...
36 0.208393 ENSG00000116032 glutamate ionotropic receptor NMDA type subuni... NMDA receptor subtype of glutamate-gated ion c...
37 0.202177 ENSG00000143252 succinate dehydrogenase complex subunit C Membrane-anchoring subunit of succinate dehydr...
38 0.200488 ENSG00000198785 glutamate ionotropic receptor NMDA type subuni... NMDA receptor subtype of glutamate-gated ion c...
39 0.199298 ENSG00000136521 NADH:ubiquinone oxidoreductase subunit B5 Accessory subunit of the mitochondrial membran...
40 0.198087 ENSG00000006116 calcium voltage-gated channel auxiliary subuni... Regulates the trafficking to the somatodendrit...
41 0.197729 ENSG00000151366 NADH:ubiquinone oxidoreductase subunit C2 Accessory subunit of the mitochondrial membran...
42 0.197039 ENSG00000075461 calcium voltage-gated channel auxiliary subuni... Regulates the activity of L-type calcium chann...
43 0.194069 ENSG00000166136 NADH:ubiquinone oxidoreductase subunit B8 Accessory subunit of the mitochondrial membran...
44 0.188456 ENSG00000142408 calcium voltage-gated channel auxiliary subuni... Regulates the activity of L-type calcium chann...
45 0.188368 ENSG00000169432 sodium voltage-gated channel alpha subunit 9 Mediates the voltage-dependent sodium ion perm...
46 0.187995 ENSG00000267855 NADH:ubiquinone oxidoreductase subunit A7 Accessory subunit of the mitochondrial membran...
47 0.184663 ENSG00000139180 NADH:ubiquinone oxidoreductase subunit A9 Accessory subunit of the mitochondrial membran...
48 0.184144 ENSG00000181273 olfactory receptor family 5 subfamily AK member 2 Odorant receptor. .
49 0.184144 ENSG00000150261 olfactory receptor family 8 subfamily K member 1 Odorant receptor. .
50 0.184144 ENSG00000196119 olfactory receptor family 8 subfamily A member 1 Odorant receptor. .
51 0.184144 ENSG00000181767 olfactory receptor family 8 subfamily H member 2 Odorant receptor. .
52 0.184144 ENSG00000280314 olfactory receptor family 8 subfamily K member... Odorant receptor. .
53 0.184144 ENSG00000181371 olfactory receptor family 5 subfamily M member 8 Odorant receptor. .
54 0.184144 ENSG00000186119 olfactory receptor family 5 subfamily D member 18 Odorant receptor. .
55 0.184144 ENSG00000181693 olfactory receptor family 8 subfamily H member 1 Odorant receptor. .
56 0.184144 ENSG00000279395 olfactory receptor family 5 subfamily L member 1 Odorant receptor. .
57 0.184144 ENSG00000167825 olfactory receptor family 5 subfamily I member 1 Odorant receptor. .
58 0.184144 ENSG00000181761 olfactory receptor family 8 subfamily H member 3 Odorant receptor. .
59 0.184144 ENSG00000196578 olfactory receptor family 5 subfamily AC member 2 Odorant receptor. .

Associated Scores:¶

  • Here are targets ranked by score (association score obtained from Open Targets)
  • For comparison, the predictedScore (from the model's prediction; units are different) is shown as an additional column
  • Our goal was not to perfectly approximate the original scores (because otherwise, why not just use those?)
  • Instead we're just seeing what the model predicts as most "likely", according to the features it used to approximate the association scores
In [17]:
(t_associationByOverallDirect
 .filter(_.diseaseId == "MONDO_0004975")
 .order_by(ibis.desc(_.score))
 .head(40)
 .inner_join(nt, _.targetId == nt.id)
 .order_by(ibis.desc(_.score)) # inner join messes up the sort order for some reason
 .execute()
)[["score", "predictedScore", "id", "approvedName", "functionDescriptions"]]
Out[17]:
score predictedScore id approvedName functionDescriptions
0 0.824413 0.091761 ENSG00000142192 amyloid beta precursor protein Functions as a cell surface receptor and perfo...
1 0.634681 0.087795 ENSG00000087085 acetylcholinesterase (Cartwright blood group) Hydrolyzes rapidly the acetylcholine neurotran...
2 0.625412 0.213285 ENSG00000183454 glutamate ionotropic receptor NMDA type subuni... Component of NMDA receptor complexes that func...
3 0.623134 0.223754 ENSG00000176884 glutamate ionotropic receptor NMDA type subunit 1 Component of NMDA receptor complexes that func...
4 0.621476 0.094793 ENSG00000114200 butyrylcholinesterase Esterase with broad substrate specificity. Con...
5 0.613484 0.213317 ENSG00000273079 glutamate ionotropic receptor NMDA type subuni... Component of NMDA receptor complexes that func...
6 0.609383 0.208393 ENSG00000116032 glutamate ionotropic receptor NMDA type subuni... NMDA receptor subtype of glutamate-gated ion c...
7 0.609168 0.090811 ENSG00000137642 sortilin related receptor 1 Sorting receptor that directs several proteins...
8 0.607102 0.083939 ENSG00000164885 cyclin dependent kinase 5 Proline-directed serine/threonine-protein kina...
9 0.604982 0.200488 ENSG00000198785 glutamate ionotropic receptor NMDA type subuni... NMDA receptor subtype of glutamate-gated ion c...
10 0.602390 0.208525 ENSG00000105464 glutamate ionotropic receptor NMDA type subuni... Component of NMDA receptor complexes that func...
11 0.601014 0.174569 ENSG00000161509 glutamate ionotropic receptor NMDA type subuni... Component of NMDA receptor complexes that func...
12 0.587812 0.083939 ENSG00000176749 cyclin dependent kinase 5 regulatory subunit 1 p35 is a neuron specific activator of CDK5. Th...
13 0.586498 0.089855 ENSG00000080815 presenilin 1 Catalytic subunit of the gamma-secretase compl...
14 0.583305 0.104351 ENSG00000064687 ATP binding cassette subfamily A member 7 Catalyzes the translocation of specific phosph...
15 0.565717 0.095840 ENSG00000130203 apolipoprotein E APOE is an apolipoprotein, a protein associati...
16 0.561265 0.091391 ENSG00000118689 forkhead box O3 Transcriptional activator that recognizes and ...
17 0.554858 0.101208 ENSG00000073756 prostaglandin-endoperoxide synthase 2 Dual cyclooxygenase and peroxidase in the bios...
18 0.526961 0.109847 ENSG00000095303 prostaglandin-endoperoxide synthase 1 Dual cyclooxygenase and peroxidase in the bios...
19 0.517934 0.095558 ENSG00000136717 bridging integrator 1 Is a key player in the control of plasma membr...
20 0.514720 0.108816 ENSG00000113161 3-hydroxy-3-methylglutaryl-CoA reductase Catalyzes the conversion of (3S)-hydroxy-3-met...
21 0.497396 0.093533 ENSG00000198087 CD2 associated protein Seems to act as an adapter protein between mem...
22 0.493573 0.073091 ENSG00000203710 complement C3b/C4b receptor 1 (Knops blood group) Membrane immune adherence receptor that plays ...
23 0.490347 0.165674 ENSG00000144285 sodium voltage-gated channel alpha subunit 1 Mediates the voltage-dependent sodium ion perm...
24 0.485417 0.088477 ENSG00000138613 aph-1 homolog B, gamma-secretase subunit Probable subunit of the gamma-secretase comple...
25 0.483862 0.177092 ENSG00000196876 sodium voltage-gated channel alpha subunit 8 Mediates the voltage-dependent sodium ion perm...
26 0.482134 0.099628 ENSG00000114026 8-oxoguanine DNA glycosylase DNA repair enzyme that incises DNA at 8-oxoG r...
27 0.480560 0.103413 ENSG00000142319 solute carrier family 6 member 3 Mediates sodium- and chloride-dependent transp...
28 0.474942 0.119958 ENSG00000204681 gamma-aminobutyric acid type B receptor subunit 1 Component of a heterodimeric G-protein coupled...
29 0.473011 0.111097 ENSG00000136928 gamma-aminobutyric acid type B receptor subunit 2 Component of a heterodimeric G-protein coupled...
30 0.471640 0.169157 ENSG00000136531 sodium voltage-gated channel alpha subunit 2 Mediates the voltage-dependent sodium ion perm...
31 0.470778 0.124441 ENSG00000153253 sodium voltage-gated channel alpha subunit 3 Mediates the voltage-dependent sodium ion perm...
32 0.470188 0.149977 ENSG00000168356 sodium voltage-gated channel alpha subunit 11 This protein mediates the voltage-dependent so...
33 0.470188 0.162964 ENSG00000183873 sodium voltage-gated channel alpha subunit 5 This protein mediates the voltage-dependent so...
34 0.470188 0.129318 ENSG00000185313 sodium voltage-gated channel alpha subunit 10 Tetrodotoxin-resistant channel that mediates t...
35 0.470188 0.142912 ENSG00000136546 sodium voltage-gated channel alpha subunit 7 Mediates the voltage-dependent sodium ion perm...
36 0.470188 0.173257 ENSG00000007314 sodium voltage-gated channel alpha subunit 4 Pore-forming subunit of a voltage-gated sodium...
37 0.470188 0.188368 ENSG00000169432 sodium voltage-gated channel alpha subunit 9 Mediates the voltage-dependent sodium ion perm...
38 0.463312 0.083460 ENSG00000120885 clusterin [Isoform 1]: Functions as extracellular chaper...
39 0.463101 0.107922 ENSG00000103546 solute carrier family 6 member 2 Mediates sodium- and chloride-dependent transp...